Experiment notes

  • Cell suspensions of meningeal dura enriched by MACS for cells expressing Cd45, Cd31, and Lyve-1
    • Cd45/Ptprc/B220/Ly-5/Lyt-4/T200: transmembrane protein tyrosine phosphatase, located on most haematopoietic cells, positive selection of leukocytes
    • Cd31/Pecam1: adhesion and signaling receptor that is expressed on endothelial and hematopoietic cells, positive selection of endothelial cells
    • Lyve1/1200012G08Rik/Lyve-1/Xlkd1/lymphatic vessel endothelial HA receptor-1: found primarily on lymphatic endothelial cells
  • The meningeal layer of the dura mater is a durable, dense fibrous membrane that passes through the foramen magnum and is continuous with the dura mater of the spinal cord. The meningeal layer of the dura mater creates several dural folds that divide the cranial cavity into freely communicating spaces.
  • Aimed for ~5000 cells per sample to be sequenced at least 50,000 reads per cell
    • Might ended up with more because we sequenced with the NovaSeq S2
  • Expecting to see a lot of leukocytes and vascular cells
  • The purpose is to check the effect of sex, Apoe isoform expression and age (and respective interactions) on the transcriptomes of the different cell subpopulations
  • 8 experimental groups/samples
    • E3_2M_F = APOE3 expressing; 2 months old; female
    • E3_14M_F = APOE3 expressing; 14 months old; female
    • E3_2M_M = APOE3 expressing; 2 months old; male
    • E3_14M_M = APOE3 expressing; 14 months old; male
    • E4_2M_F = APOE4 expressing; 2 months old; female
    • E4_14M_F = APOE4 expressing; 14 months old; female
    • E4_2M_M = APOE4 expressing; 2 months old; male
    • E4_14M_M = APOE4 expressing; 14 months old; male
  • The analysis was subset to only 14 month old mice (4 samples) after the first pass was viewed

Setup

Set working directory

knitr::opts_knit$set(root.dir = ".")

Load libraries

library(cowplot)     # plot_grid()
library(dplyr)       # left_join()
library(ggplot2)     # ggplot()
library(gridExtra)   # grid.arrange()
library(parallel)    # detectCores()
library(rtracklayer) # import()
library(scCustomize) # Merge_Seurat_List()
library(Seurat)      # Read10X_h5()
library(stringr)     # str_match()

Variables and functions

# variables
out <- "../../results/pass_1/all_clusters/"
sample_order <- c("E3.M","E3.F","E4.M","E4.F")
sample_colors <- c("#26946A","#1814A1","#EAC941","#DF5F00")
sample_order2 <- c("Male E3", "Male E4", "Female E3", "Female E4")
isoform_order <- c("E4","E3")
isoform_colors <- c("darkgray","cornflowerblue")
sex_order <- c("Male","Female")
sex_colors <- c("darkgray","purple")

# thresholds
nCount.min <- 500
nCount.max <- 20000
nFeature.min <- 250
complexity.cutoff <- 0.8
mt.cutoff <- 20
ribo.cutoff <- 50
hb.cutoff <- 3

# functions
source("../../../functions/Kennedi_single_cell_functions.R")
saveToPDF <- function(...) {
    d = dev.copy(pdf,...)
    dev.off(d)
}

Load data

Merge h5

prefix <- "../../counts/"
suffix <- "/outs/filtered_feature_bc_matrix.h5"

if (file.exists("../../rObjects/raw_h5.rds")) {
  mouse <- readRDS("../../rObjects/raw_h5.rds")
} else {
  # individual sample objects
  E3.F <- CreateSeuratObject(Read10X_h5(paste0(prefix,"E3_14M_F",suffix)))
  E3.M <- CreateSeuratObject(Read10X_h5(paste0(prefix,"E3_14M_M",suffix)))
  E4.F <- CreateSeuratObject(Read10X_h5(paste0(prefix,"E4_14M_F",suffix)))
  E4.M <- CreateSeuratObject(Read10X_h5(paste0(prefix,"E4_14M_M",suffix)))

  # merge objects
  mouse <- merge(x = E3.F, 
                 y = c(E3.M,E4.F,E4.M),
                 add.cell.ids = c("E3.F","E3.M","E4.F","E4.M"), 
                 project = "Mouse Meningeal Dura scRNAseq")
  
  # cleanup and save
  remove(E3.F, E3.M, E4.F, E4.M)
  saveRDS(mouse, "../../rObjects/raw_h5.rds")
}

# preview
mouse
## An object of class Seurat 
## 32285 features across 33388 samples within 1 assay 
## Active assay: RNA (32285 features, 0 variable features)
##  4 layers present: counts.1, counts.2, counts.3, counts.4

Annotation

  • Annotation file was downloaded from 10x Genomics
    • refdata-gex-mm10-2020-A
    • provider: GENCODE
    • description: evidence-based annotation of the mouse genome (GRCm38), version M23 (Ensembl 98)
  • Gm* genes are originally annotated by MGI and the *Rik genes are annotated by RIKEN
# read in annotation file, GENCODE GRCm38 version M23 (Ensembl 98)
if (file.exists("../../rObjects/annotation.rds")) {
  genes <- readRDS("../../rObjects/annotation.rds")
} else {
  gtf.file <- "../../refs/mouse_genes.gtf"
  genes <- rtracklayer::import(gtf.file)
  genes <- as.data.frame(genes)
  genes <- genes[genes$type == "gene",]
  saveRDS(genes, "../../rObjects/annotation.rds")
}

Metadata columns

# create sample column
barcodes <- colnames(mouse)
pattern <- "(.+)_[ACGT]+-(\\d+)"
sample <- str_match(barcodes, pattern)[,2]
table(sample)
## sample
##  E3.F  E3.M  E4.F  E4.M 
##  6747  8029 11204  7408
mouse$sample <- factor(sample, levels = sample_order)
table(mouse$sample)  # check
## 
##  E3.M  E3.F  E4.M  E4.F 
##  8029  6747  7408 11204
Idents(mouse) <- mouse$sample

# age column
mouse$age <- "14 months"

# sex column
sex <- str_match(mouse$sample, "E\\d.([FM])")[,2]
sex <- gsub("F","Female",sex)
sex <- gsub("M","Male",sex)
mouse$sex <- factor(sex, levels = sex_order)

# Apoe isoform column
isoform <- str_match(mouse$sample, "(E\\d).[FM]")[,2]
mouse$isoform <- factor(isoform, levels = isoform_order)

# sample2 column (for title formatting)
mouse$sample2 <- paste(mouse$sex, mouse$isoform, sep = " ")

# cell.complexity
mouse$cell.complexity <- log10(mouse$nFeature_RNA) / log10(mouse$nCount_RNA)

# percent.mt
mt.genes <- genes[genes$seqnames == "chrM",13]
mouse$percent.mt <- PercentageFeatureSet(mouse, features = mt.genes)
mt.genes
##  [1] "mt-Nd1"  "mt-Nd2"  "mt-Co1"  "mt-Co2"  "mt-Atp8" "mt-Atp6" "mt-Co3" 
##  [8] "mt-Nd3"  "mt-Nd4l" "mt-Nd4"  "mt-Nd5"  "mt-Nd6"  "mt-Cytb"
# percent.ribo
# ribosomal proteins begin with 'Rps' or 'Rpl' in this annotation file
# mitochondrial ribosomes start with 'Mrps' or 'Mrpl'
gene.names <- genes$gene_name
ribo <- gene.names[grep("^Rp[sl]", gene.names)]
mt.ribo <- gene.names[grep("^Mrp[sl]", gene.names)]
ribo.combined <- c(mt.ribo,ribo)
mouse$percent.ribo.protein <- PercentageFeatureSet(mouse, features = ribo.combined)
ribo.combined
##   [1] "Mrpl15"     "Mrpl30"     "Mrps9"      "Mrpl44"     "Mrps14"    
##   [6] "Mrpl41"     "Mrps2"      "Mrps5"      "Mrps26"     "Mrps28"    
##  [11] "Mrpl47"     "Mrpl24"     "Mrpl9"      "Mrps21"     "Mrpl50"    
##  [16] "Mrpl37"     "Mrps15"     "Mrpl20"     "Mrpl33"     "Mrpl1"     
##  [21] "Mrps18c"    "Mrps17"     "Mrps33"     "Mrpl35"     "Mrpl19"    
##  [26] "Mrpl53"     "Mrps25"     "Mrpl51"     "Mrps35"     "Mrps12"    
##  [31] "Mrpl46"     "Mrps11"     "Mrpl48"     "Mrpl17"     "Mrpl23"    
##  [36] "Mrps31"     "Mrpl34"     "Mrpl4"      "Mrps22"     "Mrpl3"     
##  [41] "Mrpl54"     "Mrpl42"     "Mrps24"     "Mrpl22"     "Mrpl55"    
##  [46] "Mrps23"     "Mrpl27"     "Mrpl10"     "Mrpl45"     "Mrpl58"    
##  [51] "Mrps7"      "Mrpl38"     "Mrpl12"     "Mrpl32"     "Mrpl36"    
##  [56] "Mrps27"     "Mrps36"     "Mrps30"     "Mrps16"     "Mrpl52"    
##  [61] "Mrpl57"     "Mrpl13"     "Mrpl40"     "Mrpl39"     "Mrps6"     
##  [66] "Mrpl18"     "Mrps34"     "Mrpl28"     "Mrps18b"    "Mrpl14"    
##  [71] "Mrps18a"    "Mrpl2"      "Mrps10"     "Mrpl21"     "Mrpl11"    
##  [76] "Mrpl49"     "Mrpl16"     "Mrpl43"     "Rpl7"       "Rpl31"     
##  [81] "Rpl37a"     "Rps6kc1"    "Rpl7a"      "Rpl12"      "Rpl35"     
##  [86] "Rps21"      "Rpl22l1"    "Rps3a1"     "Rps27"      "Rpl34"     
##  [91] "Rps20"      "Rps6"       "Rps8"       "Rps6ka1"    "Rpl11"     
##  [96] "Rpl22"      "Rpl9"       "Rpl5"       "Rplp0"      "Rpl6"      
## [101] "Rpl21"      "Rpl32"      "Rps9"       "Rpl28"      "Rps5"      
## [106] "Rps19"      "Rps16"      "Rps11"      "Rpl13a"     "Rpl18"     
## [111] "Rps17"      "Rps3"       "Rpl27a"     "Rps13"      "Rps15a"    
## [116] "Rplp2"      "Rpl18a"     "Rpl13"      "Rps25"      "Rpl10-ps3" 
## [121] "Rplp1"      "Rpl4"       "Rps27l"     "Rpl29"      "Rps27rt"   
## [126] "Rpsa"       "Rpl14"      "Rps12"      "Rps15"      "Rpl41"     
## [131] "Rps26"      "Rps27a"     "Rpl26"      "Rpl23a"     "Rpl9-ps1"  
## [136] "Rps6kb1"    "Rpl23"      "Rpl19"      "Rpl27"      "Rpl38"     
## [141] "Rps7"       "Rpl10l"     "Rps29"      "Rpl36al"    "Rps6kl1"   
## [146] "Rps6ka5"    "Rps23"      "Rpl15"      "Rps24"      "Rpl36a-ps1"
## [151] "Rpl37"      "Rpl30"      "Rpl8"       "Rpl3"       "Rps19bp1"  
## [156] "Rpl39l"     "Rpl35a"     "Rpl24"      "Rps6ka2"    "Rps2"      
## [161] "Rpl3l"      "Rps10"      "Rpl10a"     "Rps28"      "Rps18"     
## [166] "Rpl7l1"     "Rpl36"      "Rpl36-ps4"  "Rps14"      "Rpl17"     
## [171] "Rps6kb2"    "Rps6ka4"    "Rpl9-ps6"   "Rpl39"      "Rpl10"     
## [176] "Rps4x"      "Rps6ka6"    "Rpl36a"     "Rps6ka3"
# percent.hb
# percent.hb - hemoglobin proteins begin with 'Hbb' or 'Hba' for mouse
hb.genes <- gene.names[grep("^Hb[ba]-", gene.names)]
mouse$percent.hb <- PercentageFeatureSet(mouse, features = hb.genes)
hb.genes
## [1] "Hbb-bt"  "Hbb-bs"  "Hbb-bh2" "Hbb-bh1" "Hbb-y"   "Hba-x"   "Hba-a1" 
## [8] "Hba-a2"

Pre-filtering QC

Number of cells

# Visualize the number of cell counts per sample
data <- as.data.frame(table(mouse$sample))
colnames(data) <- c("sample","frequency")

ncells1 <- ggplot(data, aes(x = sample, y = frequency, fill = sample)) + 
  geom_col() +
  theme_classic() +
  geom_text(aes(label = frequency), 
            position=position_dodge(width=0.9), 
            vjust=-0.25) +
  scale_fill_manual(values = sample_colors) + 
  scale_y_continuous(breaks = seq(0,12000, by = 2000), limits = c(0,12000)) +
  ggtitle("Raw: cells per sample") +
  theme(legend.position =  "none") + 
  theme(axis.text.x = element_text(angle = 45, hjust=1))
ncells1

Density plots

# Visualize nCount_RNA
den1 <- ggplot(mouse@meta.data,
       aes(color = sample,
           x = nCount_RNA,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("nCount_RNA") +
  ylab("Density") +
  theme(legend.position =  "none") +
  geom_vline(xintercept = nCount.min, linetype = "dashed", colour = "red") +
  geom_vline(xintercept = nCount.max, linetype = "dashed", colour = "red") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize nFeature_RNA
den2 <- ggplot(mouse@meta.data,
       aes(color = sample,
           x = nFeature_RNA,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("nFeature_RNA") +
  ylab("Density") +
  geom_vline(xintercept = nFeature.min, linetype = "dashed", colour = "red") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize cell complexity
# Quality cells are usually above 0.85
den3 <- ggplot(mouse@meta.data,
       aes(color = sample,
           x = cell.complexity,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("Cell Complexity (log10(nFeature/nCount))") +
  ylab("Density") +
  geom_vline(xintercept = complexity.cutoff, linetype = "dashed", colour = "red") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize percent.mt
den4 <- ggplot(mouse@meta.data,
       aes(color = sample,
           x = percent.mt,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  geom_vline(xintercept = mt.cutoff, linetype = "dashed", colour = "red") +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("% Mitochondrial Genes") +
  ylab("Density") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize percent.ribo.protein
den5 <- ggplot(mouse@meta.data,
       aes(color = sample,
           x = percent.ribo.protein,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("% Ribosomal Protein Genes") +
  ylab("Density") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize percent.hb
den6 <- ggplot(mouse@meta.data,
       aes(color = sample,
           x = percent.hb,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  geom_vline(xintercept = hb.cutoff, linetype = "dashed", colour = "red") +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("% Hemoglobin Genes") +
  ylab("Density") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Arrange graphs in grid
plots <- list(den1,den2,den3,den4,den5,den6)
layout <- rbind(c(1,4),c(2,5),c(3,6))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

Violin plots

# nFeature, nCount, and cell.complexity violins
v1 <- VlnPlot(mouse,
              features = c("nFeature_RNA", "nCount_RNA","cell.complexity"),
              ncol = 3,
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
v1

#  percent violins
v2 <- VlnPlot(mouse,
              features = c("percent.mt","percent.ribo.protein","percent.hb"),
              ncol = 3,
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
v2

Scatter plots

s1 <- ggplot(
  mouse@meta.data,
  aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) + 
  geom_point() + 
  geom_smooth(method = "lm") +
    scale_x_log10() +       
  scale_y_log10() + 
  theme_classic() +
  geom_vline(xintercept = nCount.min, linetype = "dashed", colour = "red") + 
  geom_hline(yintercept = nFeature.min, linetype = "dashed", colour = "red") + 
  facet_wrap(~sample) +
  scale_colour_gradient(low = "gray90", high = "black", limits =c(0,100))
s1
## `geom_smooth()` using formula = 'y ~ x'

s2 <- FeatureScatter(mouse,
                     feature1 = "nCount_RNA",
                     feature2 = "percent.mt",
                     group.by = 'sample',
                     cols = sample_colors,
                     shuffle = TRUE)
s2

Filtering

Cell-level filtering

# filter
mouse.filtered <- subset(mouse,
                         subset = (nCount_RNA > nCount.min) &
                           (nCount_RNA < nCount.max) &
                           (nFeature_RNA > nFeature.min) &
                           (cell.complexity > complexity.cutoff) &
                           (percent.mt < mt.cutoff) &
                           (percent.hb < hb.cutoff))

# print cells removed
print(paste0(dim(mouse)[2] - dim(mouse.filtered)[2]," cells removed"))
## [1] "12151 cells removed"

Gene-level filtering

Remove lowly expressed genes. We will keep genes that have at least 1 count in 10 cells.

# filter genes
mouse.filtered <- JoinLayers(mouse.filtered)
counts <- GetAssayData(object = mouse.filtered, layer = "counts")
nonzero <- counts > 0  # produces logical
keep <- Matrix::rowSums(nonzero) >= 10  # sum the true/false
counts.filtered <- counts[keep,]  # keep certain genes

# overwrite mouse.filtered
mouse.filtered <- CreateSeuratObject(counts.filtered, 
                                     meta.data = mouse.filtered@meta.data)

# print features removed
print(paste0(dim(counts)[1] - dim(counts.filtered)[1], " features removed"))
## [1] "10587 features removed"

Remove highly abundant mitochondrial transcripts.

# remove mt.genes
counts <- GetAssayData(object = mouse.filtered, layer = "counts")
keep <- !rownames(counts) %in% mt.genes # false when mt.gene
counts.filtered <- counts[keep,]

# overwrite mouse.filtered
mouse.filtered <- CreateSeuratObject(counts.filtered,
                                     meta.data = mouse.filtered@meta.data)

# print features removed
print(paste0(dim(counts)[1] - dim(counts.filtered)[1], " features removed"))
## [1] "13 features removed"
# cleanup data
remove(mouse,counts,counts.filtered,nonzero)

Post-filtering QC

Number of cells

# Visualize the number of cell counts per sample
data <- as.data.frame(table(mouse.filtered$sample))
colnames(data) <- c("sample","frequency")

ncells2 <- ggplot(data, aes(x = sample, y = frequency, fill = sample)) + 
  geom_col() +
  theme_classic() +
  geom_text(aes(label = frequency), 
            position=position_dodge(width=0.9), 
            vjust=-0.25) +
  scale_fill_manual(values = sample_colors) + 
  scale_y_continuous(breaks = seq(0,12000, by = 2000), limits = c(0,12000)) +
  ggtitle("Filtered: cells per sample") +
  theme(legend.position =  "none") + 
  theme(axis.text.x = element_text(angle = 45, hjust=1))

# Arrange graphs in grid
plots <- list(ncells1,ncells2)
layout <- rbind(c(1),c(2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

Density plots

# Visualize nCount_RNA
den1 <- ggplot(mouse.filtered@meta.data,
       aes(color = sample,
           x = nCount_RNA,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("nCount_RNA") +
  ylab("Density") +
  theme(legend.position =  "none") +
  geom_vline(xintercept = nCount.min, linetype = "dashed", colour = "red") +
  geom_vline(xintercept = nCount.max, linetype = "dashed", colour = "red") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize nFeature_RNA
den2 <- ggplot(mouse.filtered@meta.data,
       aes(color = sample,
           x = nFeature_RNA,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("nFeature_RNA") +
  ylab("Density") +
  geom_vline(xintercept = nFeature.min, linetype = "dashed", colour = "red") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize cell complexity
# Quality cells are usually above 0.85
den3 <- ggplot(mouse.filtered@meta.data,
       aes(color = sample,
           x = cell.complexity,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("Cell Complexity (log10(nFeature/nCount))") +
  ylab("Density") +
  geom_vline(xintercept = complexity.cutoff, linetype = "dashed", colour = "red") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize percent.mt
den4 <- ggplot(mouse.filtered@meta.data,
       aes(color = sample,
           x = percent.mt,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_continuous(n.breaks = 4) +
  geom_vline(xintercept = mt.cutoff, linetype = "dashed", colour = "red") +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("% Mitochondrial Genes") +
  ylab("Density") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize percent.ribo.protein
den5 <- ggplot(mouse.filtered@meta.data,
       aes(color = sample,
           x = percent.ribo.protein,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("% Ribosomal Protein Genes") +
  ylab("Density") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Visualize percent.hb
den6 <- ggplot(mouse.filtered@meta.data,
       aes(color = sample,
           x = percent.hb,
           fill = sample)) +
  geom_density(alpha = 0.2) +
  theme_classic() +
  scale_x_log10() +
  geom_vline(xintercept = hb.cutoff, linetype = "dashed", colour = "red") +
  scale_color_manual(values = sample_colors) +
  theme(legend.position =  "none") +
  scale_fill_manual(values = sample_colors) +
  xlab("% Hemoglobin Genes") +
  ylab("Density") +
  theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))

# Arrange graphs in grid
plots <- list(den1,den2,den3,den4,den5,den6)
layout <- rbind(c(1,4),c(2,5),c(3,6))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)

Violin plots

# nFeature, nCount, and cell.complexity violins
v3 <- VlnPlot(mouse.filtered,
              features = c("nFeature_RNA", "nCount_RNA","cell.complexity"),
              ncol = 3,
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
v3

#  percent violins
v4 <- VlnPlot(mouse.filtered,
              features = c("percent.mt","percent.ribo.protein","percent.hb"),
              ncol = 3,
              group.by = 'sample',
              cols = sample_colors,
              pt.size = 0)
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
v4

Scatter plots

s3 <- ggplot(
  mouse.filtered@meta.data,
  aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) + 
  geom_point() + 
  geom_smooth(method = "lm") +
    scale_x_log10() +       
  scale_y_log10() + 
  theme_classic() +
  geom_vline(xintercept = nCount.min, linetype = "dashed", colour = "red") + 
  geom_hline(yintercept = nFeature.min, linetype = "dashed", colour = "red") + 
  facet_wrap(~sample) +
  scale_colour_gradient(low = "gray90", high = "black", limits =c(0,100))
s3
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

s4 <- FeatureScatter(mouse.filtered,
                     feature1 = "nCount_RNA",
                     feature2 = "percent.mt",
                     group.by = 'sample',
                     cols = sample_colors,
                     shuffle = TRUE)
s4

Box plot

# Visualize the distribution of genes detected per cell via boxplot
b1 <- ggplot(mouse.filtered@meta.data,
       aes(x = sample, 
           y = log10(nFeature_RNA), 
           fill=sample)) + 
  geom_boxplot() + 
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
  theme(plot.title = element_text(hjust = 0.5, face="bold")) +
  ggtitle("Unique Genes / Cell / Sample") +
  scale_color_manual(values = sample_colors) +
  scale_fill_manual(values = sample_colors) +
  xlab("Sample")
b1

Top transcripts

  • Gm42418: Rn18s-rs5
df <- data.frame(gene_name = rownames(mouse.filtered))
df$rsum <- rowSums(x = mouse.filtered, slot = "counts")
df <- df[order(df$rsum, decreasing = TRUE),]
rownames(df) <- 1:nrow(df)
head(df, 10)
##    gene_name     rsum
## 1    Gm42418 11194504
## 2     Malat1  3490427
## 3      Cmss1   741422
## 4     Tmsb4x   732027
## 5     S100a9   630464
## 6     Camk1d   590094
## 7     S100a8   456591
## 8       Cd74   433221
## 9       Igkc   426296
## 10  AY036118   396173

Unwanted variation

Cell cycle

markers <- "../../../functions/cell_cycle_markers.tsv"
path <- paste0(out, "unwanted_variation")
mouse.filtered[["phase"]] <- cell_cycle_QC(obj = mouse.filtered,
                                           markersPath = markers,
                                           species = "mouse",
                                           outDir = path)

Mitochondria

mouse.filtered[["mito.factor"]] <- mitochondria_QC(obj = mouse.filtered,
                                                   outDir = path)

Cluster

SCTransform

  • SCTransform method is a more accurate method of normalizing, estimating the variance of the raw filtered data, and identifying the most variable genes.
  • Variation in sequencing depth (total nCount_RNA per cell) is normalized using a regularized negative binomial model.
  • If there are other sources of uninteresting variation it can be included.
# split
mouse.filtered[["RNA"]] <- split(mouse.filtered[["RNA"]], 
                                 f = mouse.filtered$sample)

# transform
mouse.filtered <- SCTransform(mouse.filtered, verbose = FALSE)
  • NOTE: By default, after normalizing, adjusting the variance, and regressing out uninteresting sources of variation, SCTransform will rank the genes by residual variance and output the 3000 most variant genes. If the data set has larger cell numbers, then it may be beneficial to adjust this parameter higher using the variable.features.n argument. Additionally the last line of output specifies “Set default assay to SCT”.
  • It is suggested to not regress out batch, and instead use a data integration method like Harmony

PCA

# run PCA on the merged object
mouse.filtered <- RunPCA(object = mouse.filtered, assay = "SCT")
# Reset idents and levels
DefaultAssay(mouse.filtered) <- "SCT"
Idents(mouse.filtered) <- "sample"
mouse.filtered$sample <- factor(mouse.filtered$sample, 
                              levels = sample_order)
mouse.filtered$sex <- factor(mouse.filtered$sex, 
                           levels = sex_order)
mouse.filtered$isoform <- factor(mouse.filtered$isoform,
                               levels = isoform_order)

# Plot PCA
pca1 <- DimPlot(mouse.filtered,
                reduction = "pca",
                split.by = "sample",
                group.by = "sample",
                cols = sample_colors,
                ncol = 2)
pca1

pca2 <- DimPlot(mouse.filtered,
                reduction = "pca",
                split.by = "isoform",
                group.by = "isoform",
                cols = isoform_colors)
pca2

pca3 <- DimPlot(mouse.filtered,
                reduction = "pca",
                split.by = "sex",
                group.by = "sex",
                cols = sex_colors)
pca3

pca4 <- DimPlot(mouse.filtered,
                reduction = "pca",
                group.by = "sample",
                cols = sample_colors,
                shuffle = TRUE,
                raster = FALSE)
pca4

e1 <- ElbowPlot(mouse.filtered) +
  geom_vline(xintercept = 15, linetype = "dashed", colour = "red")
e1

Harmony

# integrate
mouse.filtered <- IntegrateLayers(mouse.filtered,
                                  method = "HarmonyIntegration",
                                  assay = "SCT",
                                  orig.reduction = "pca")
## Warning: HarmonyMatrix is deprecated and will be removed in the future from the
## API in the future
## Warning: Warning: The parameters do_pca and npcs are deprecated. They will be ignored for this function call and please remove parameters do_pca and npcs and pass to harmony cell_embeddings directly.
## This warning is displayed once per session.
## Warning: Warning: The parameter tau is deprecated. It will be ignored for this function call and please remove parameter tau in future function calls. Advanced users can set value of parameter tau by using parameter .options and function harmony_options().
## This warning is displayed once per session.
## Warning: Warning: The parameter block.size is deprecated. It will be ignored for this function call and please remove parameter block.size in future function calls. Advanced users can set value of parameter block.size by using parameter .options and function harmony_options().
## This warning is displayed once per session.
## Warning: Warning: The parameter max.iter.harmony is replaced with parameter max_iter. It will be ignored for this function call and please use parameter max_iter in future function calls.
## This warning is displayed once per session.
## Warning: Warning: The parameter max.iter.cluster is deprecated. It will be ignored for this function call and please remove parameter max.iter.cluster in future function calls. Advanced users can set value of parameter max.iter.cluster by using parameter .options and function harmony_options().
## This warning is displayed once per session.
## Warning: Warning: The parameter epsilon.cluster is deprecated. It will be ignored for this function call and please remove parameter epsilon.cluster in future function calls. Advanced users can set value of parameter epsilon.cluster by using parameter .options and function harmony_options().
## This warning is displayed once per session.
## Warning: Warning: The parameter epsilon.harmony is deprecated. It will be ignored for this function call and please remove parameter epsilon.harmony in future function calls. If users want to control if harmony would stop early or not, use parameter early_stop. Advanced users can set value of parameter epsilon.harmony by using parameter .options and function harmony_options().
## This warning is displayed once per session.
## Transposing data matrix
## Using automatic lambda estimation
## Initializing state using k-means centroids initialization
## Harmony 1/10
## Harmony 2/10
## Harmony 3/10
## Harmony 4/10
## Harmony converged after 4 iterations
# re-join layers
mouse.filtered[["RNA"]] <- JoinLayers(mouse.filtered[["RNA"]])
hrm1 <- DimPlot(mouse.filtered,
                reduction = "harmony",
                split.by = "sample",
                group.by = "sample",
                cols = sample_colors,
                ncol = 2)
hrm1

hrm2 <- DimPlot(mouse.filtered,
                reduction = "harmony",
                split.by = "isoform",
                group.by = "isoform",
                cols = isoform_colors)
hrm2

hrm3 <- DimPlot(mouse.filtered,
                reduction = "harmony",
                split.by = "sex",
                group.by = "sex",
                cols = sex_colors)
hrm3

hrm4 <- DimPlot(mouse.filtered,
                reduction = "harmony",
                group.by = "sample",
                cols = sample_colors,
                shuffle = TRUE,
                raster = FALSE)
hrm4

# Plot elbow
mouse.filtered@reductions$harmony@stdev <-
  apply(mouse.filtered@reductions$harmony@cell.embeddings, 2, sd)
e1 <- ElbowPlot(mouse.filtered,
                reduction = "harmony") +
  geom_vline(xintercept = 15, linetype = "dashed", color = "red")
e1

UMAP

To overcome the extensive technical noise in the expression of any single gene for scRNA-seq data, Seurat assigns cells to clusters based on their PCA scores derived from the expression of the integrated most variable genes, with each PC essentially representing a “metagene” that combines information across a correlated gene set. Determining how many PCs to include in the clustering step is therefore important to ensure that we are capturing the majority of the variation, or cell types, present in our dataset.

# run UMAP
mouse.filtered <- RunUMAP(mouse.filtered,
                          dims = 1:15,
                          reduction = "harmony",
                          assay = "SCT",
                          n.components = 3) # set to 3 to use with VR
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 13:14:49 UMAP embedding parameters a = 0.9922 b = 1.112
## 13:14:49 Read 21237 rows and found 15 numeric columns
## 13:14:49 Using Annoy for neighbor search, n_neighbors = 30
## 13:14:49 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 13:14:52 Writing NN index file to temp file /tmp/RtmpWK4VsO/file349a4c23de1b8a
## 13:14:52 Searching Annoy index using 1 thread, search_k = 3000
## 13:15:00 Annoy recall = 100%
## 13:15:01 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 13:15:03 Initializing from normalized Laplacian + noise (using RSpectra)
## 13:15:04 Commencing optimization for 200 epochs, with 877344 positive edges
## 13:15:13 Optimization finished
# plot UMAP
DimPlot(mouse.filtered,
        cols = sample_colors,
        shuffle = TRUE)

Clusters

Seurat uses a graph-based clustering approach, which embeds cells in a graph structure, using a K-nearest neighbor (KNN) graph (by default), with edges drawn between cells with similar gene expression patterns. Then, it attempts to partition this graph into highly interconnected ‘quasi-cliques’ or ‘communities’ [Seurat - Guided Clustering Tutorial].

We will use the FindClusters() function to perform the graph-based clustering. The resolution is an important argument that sets the “granularity” of the downstream clustering and will need to be optimized for every individual experiment. For datasets of 3,000 - 5,000 cells, the resolution set between 0.4-1.4 generally yields good clustering. Increased resolution values lead to a greater number of clusters, which is often required for larger datasets.

The FindClusters() function allows us to enter a series of resolutions and will calculate the “granularity” of the clustering. This is very helpful for testing which resolution works for moving forward without having to run the function for each resolution.

# Determine the K-nearest neighbor graph
mouse.unannotated <- FindNeighbors(object = mouse.filtered,
                                   assay = "SCT", # set as default after SCTransform
                                   reduction = "harmony",
                                   dims = 1:15)

# Determine the clusters for various resolutions
mouse.unannotated <- FindClusters(object = mouse.unannotated,
                                  algorithm = 1, # 1 = Louvain
                                  resolution = seq(0.1, 0.5, by=0.1))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 21237
## Number of edges: 656664
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9777
## Number of communities: 12
## Elapsed time: 1 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 21237
## Number of edges: 656664
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9677
## Number of communities: 15
## Elapsed time: 2 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 21237
## Number of edges: 656664
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9601
## Number of communities: 18
## Elapsed time: 2 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 21237
## Number of edges: 656664
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9534
## Number of communities: 21
## Elapsed time: 2 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 21237
## Number of edges: 656664
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9475
## Number of communities: 26
## Elapsed time: 2 seconds
mouse.unannotated$seurat_clusters <- mouse.unannotated$SCT_snn_res.0.5

Explore resolutions

# 0.1
DimPlot(mouse.unannotated,
        group.by = "SCT_snn_res.0.1",
        label = TRUE)

# 0.2
DimPlot(mouse.unannotated,
        group.by = "SCT_snn_res.0.2",
        label = TRUE)

# 0.3
DimPlot(mouse.unannotated,
        group.by = "SCT_snn_res.0.3",
        label = TRUE)

# 0.4
DimPlot(mouse.unannotated,
        group.by = "SCT_snn_res.0.4",
        label = TRUE)

# 0.5
DimPlot(mouse.unannotated,
        group.by = "SCT_snn_res.0.5",
        label = TRUE)